PM 566 Lab 5

Author

Dana Gonzalez

Setup in R

library(data.table)
library(dtplyr)
library (dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:data.table':

    between, first, last
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()     masks data.table::between()
✖ dplyr::filter()      masks stats::filter()
✖ dplyr::first()       masks data.table::first()
✖ lubridate::hour()    masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag()         masks stats::lag()
✖ dplyr::last()        masks data.table::last()
✖ lubridate::mday()    masks data.table::mday()
✖ lubridate::minute()  masks data.table::minute()
✖ lubridate::month()   masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second()  masks data.table::second()
✖ purrr::transpose()   masks data.table::transpose()
✖ lubridate::wday()    masks data.table::wday()
✖ lubridate::week()    masks data.table::week()
✖ lubridate::yday()    masks data.table::yday()
✖ lubridate::year()    masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
stations <- fread("https://noaa-isd-pds.s3.amazonaws.com/isd-history.csv")
stations[, USAF := as.integer(USAF)]
Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
stations[, USAF   := fifelse(USAF == 999999, NA_integer_, USAF)]
stations[, CTRY   := fifelse(CTRY == "", NA_character_, CTRY)]
stations[, STATE  := fifelse(STATE == "", NA_character_, STATE)]

stations <- stations[!is.na(USAF)]

stations[, n := 1:.N, by = .(USAF)]
stations <- stations[n == 1,][, n := NULL]
stations <- fread("https://noaa-isd-pds.s3.amazonaws.com/isd-history.csv")
stations[, USAF := as.integer(USAF)]
Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
stations[, USAF   := fifelse(USAF == 999999, NA_integer_, USAF)]
stations[, CTRY   := fifelse(CTRY == "", NA_character_, CTRY)]
stations[, STATE  := fifelse(STATE == "", NA_character_, STATE)]

stations <- stations[!is.na(USAF)]

stations[, n := 1:.N, by = .(USAF)]
stations <- stations[n == 1,][, n := NULL]
if(!file.exists("met_all.gz"))
  download.file(
    url = "https://raw.githubusercontent.com/USCbiostats/data-science-data/master/02_met/met_all.gz",
    destfile = "met_all.gz",
    method = "libcurl",
    timeout = 60
  )
met <- data.table::fread("met_all.gz")

merge(
  x     = met,      
  y     = stations, 
  by.x  = "USAFID",
  by.y  = "USAF", 
  all.x = TRUE,      
  all.y = FALSE
  ) |> nrow()
[1] 2377343
stations <- stations[!duplicated(stations$USAF), ]

met <- merge(
  x     = met,      
  y     = stations, 
  by.x  = "USAFID",
  by.y  = "USAF", 
  all.x = TRUE,      
  all.y = FALSE
  )
head(met[, c('USAFID', 'WBAN.y', 'STATE')], n = 4)
Key: <USAFID>
   USAFID WBAN.y  STATE
    <int>  <int> <char>
1: 690150  93121     CA
2: 690150  93121     CA
3: 690150  93121     CA
4: 690150  93121     CA

Question 1: Representative station for the US

What is the median station in terms of temperature, wind speed, and atmospheric pressure? Look for the three weather stations that best represent continental US using the quantile() function. Do these three coincide?

quantiles_temp <- quantile(met$temp, na.rm = TRUE)
quantiles_wind <- quantile(met$wind.sp, na.rm = TRUE)
quantiles_pressure <- quantile(met$atm.press, na.rm = TRUE)

quantiles_df <- data.frame(
  metric = rep(c("Temperature", "Wind Speed", "Pressure"), each = length(quantiles_temp)),
  quantile = rep(names(quantiles_temp), times = 3),
  value = c(quantiles_temp, quantiles_wind, quantiles_pressure))

quantiles_df
        metric quantile  value
1  Temperature       0%  -40.0
2  Temperature      25%   19.6
3  Temperature      50%   23.5
4  Temperature      75%   27.8
5  Temperature     100%   56.0
6   Wind Speed       0%    0.0
7   Wind Speed      25%    0.0
8   Wind Speed      50%    2.1
9   Wind Speed      75%    3.6
10  Wind Speed     100%   36.0
11    Pressure       0%  960.5
12    Pressure      25% 1011.8
13    Pressure      50% 1014.1
14    Pressure      75% 1016.4
15    Pressure     100% 1059.9
any(duplicated(names(met)))
[1] FALSE
names(met) <- make.unique(names(met))
median_temp <- met %>% filter(temp == 23.5)
median(median_temp$USAFID)
[1] 722171
median_wind <- met %>% filter(wind.sp == 2.1)
median(median_wind$USAFID)
[1] 722860
median_pressure <- met %>% filter(atm.press == 1014.1)
median(median_pressure$USAFID)
[1] 723564

The median stations in the US for temperature, wind speed, and atmospheric pressure are as follows: 722171, 722860, and 723564, respectively.

Question 2: Representative station per state

Just like the previous question, you are asked to identify what is the most representative, the median, station per state. This time, instead of looking at one variable at a time, look at the euclidean distance. If multiple stations show in the median, select the one located at the lowest latitude.

state_median_temp <- met %>%
  group_by(STATE) %>%
  summarize(median_temp = median(temp, na.rm = TRUE), .groups = 'drop')

state_median_temp
# A tibble: 48 × 2
   STATE median_temp
   <chr>       <dbl>
 1 AL           25.3
 2 AR           25.6
 3 AZ           29  
 4 CA           21.1
 5 CO           18.9
 6 CT           22.2
 7 DE           24.4
 8 FL           27  
 9 GA           26  
10 IA           21  
# ℹ 38 more rows
state_median_temp_usafid <- met %>%
  inner_join(state_median_temp, by = "STATE") %>%
  filter(temp == median_temp) %>%
  group_by(STATE) %>%
  summarize(median_usafid = median(USAFID, na.rm = TRUE), .groups = 'drop')

state_median_temp_usafid
# A tibble: 48 × 2
   STATE median_usafid
   <chr>         <dbl>
 1 AL           722031
 2 AR           723417
 3 AZ           723745
 4 CA           722970
 5 CO           724676
 6 CT           725046
 7 DE           724180
 8 FL           722053
 9 GA           720951
10 IA           725474
# ℹ 38 more rows
state_median_wind <- met %>%
  group_by(STATE) %>%
  summarize(median_wind = median(wind.sp, na.rm = TRUE), .groups = 'drop')

state_median_wind
# A tibble: 48 × 2
   STATE median_wind
   <chr>       <dbl>
 1 AL            1.5
 2 AR            2.1
 3 AZ            3.1
 4 CA            2.6
 5 CO            2.6
 6 CT            2.1
 7 DE            2.6
 8 FL            2.6
 9 GA            1.5
10 IA            2.6
# ℹ 38 more rows
state_median_wind_usafid <- met %>%
  inner_join(state_median_wind, by = "STATE") %>%
  filter(wind.sp == median_wind) %>%
  group_by(STATE) %>%
  summarize(median_usafid = median(USAFID, na.rm = TRUE), .groups = 'drop')

state_median_wind_usafid
# A tibble: 48 × 2
   STATE median_usafid
   <chr>         <dbl>
 1 AL           722238
 2 AR           723418
 3 AZ           722785
 4 CA           723762
 5 CO           724627
 6 CT           725045
 7 DE           724093
 8 FL           722055
 9 GA           721035
10 IA           725467
# ℹ 38 more rows
state_median_pressure <- met %>%
  group_by(STATE) %>%
  summarize(median_pressure = median(atm.press, na.rm = TRUE), .groups = 'drop')

state_median_pressure
# A tibble: 48 × 2
   STATE median_pressure
   <chr>           <dbl>
 1 AL              1015.
 2 AR              1014.
 3 AZ              1011.
 4 CA              1013.
 5 CO              1014.
 6 CT              1015.
 7 DE              1015.
 8 FL              1015.
 9 GA              1015.
10 IA              1015.
# ℹ 38 more rows
state_median_pressure_usafid <- met %>%
  inner_join(state_median_pressure, by = "STATE") %>%
  filter(atm.press == median_pressure) %>%
  group_by(STATE) %>%
  summarize(median_usafid = floor(median(USAFID, na.rm = TRUE)), .groups = 'drop')

state_median_pressure_usafid
# A tibble: 46 × 2
   STATE median_usafid
   <chr>         <dbl>
 1 AL           722269
 2 AR           723407
 3 AZ           722783
 4 CA           723810
 5 CO           724676
 6 CT           725046
 7 DE           724093
 8 FL           722105
 9 GA           722190
10 IA           725472
# ℹ 36 more rows

Question 3: In the middle?

For each state, identify what is the station that is closest to the mid-point of the state. Combining these with the stations you identified in the previous question, use leaflet() to visualize all ~100 points in the same figure, applying different colors for those identified in this question.

library(leaflet)
median_temp2 <- (unique(median_temp[,c("LAT","LON")])) 
leaflet(median_temp) |> 
  addProviderTiles('CartoDB.Positron') |> 
  addCircles(lat = ~LAT, lng = ~LON,
             opacity = 1, fillOpacity = 1, radius = 400, color = "lightblue")
library(leaflet)
median_wind2 <- (unique(median_wind[,c("LAT","LON")])) 
leaflet(median_wind) |> 
  addProviderTiles('CartoDB.Positron') |> 
  addCircles(lat = ~LAT, lng = ~LON,
             opacity = 1, fillOpacity = 1, radius = 400, color = "darkred")
library(leaflet)
median_pressure2 <- (unique(median_pressure[,c("LAT","LON")])) 
leaflet(median_pressure) |> 
  addProviderTiles('CartoDB.Positron') |> 
  addCircles(lat = ~LAT, lng = ~LON,
             opacity = 1, fillOpacity = 1, radius = 400, color = "goldenrod")
state_centroids <- median_temp %>%
  group_by(STATE) %>%
  summarise(
    longitude = mean(LON, na.rm = TRUE),
    latitude = mean(LAT, na.rm = TRUE)
  )
state_centroids
# A tibble: 37 × 3
   STATE longitude latitude
   <chr>     <dbl>    <dbl>
 1 AL        -86.5     32.7
 2 AR        -92.2     34.9
 3 AZ       -110.      31.7
 4 CA       -118.      35.3
 5 CO       -106.      39.1
 6 DE        -75.5     39.1
 7 FL        -82.7     29.5
 8 GA        -83.1     32.5
 9 IA        -93.4     41.8
10 IL        -88.7     40.2
# ℹ 27 more rows
library(sf)
Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
state_centroids_sf <- st_as_sf(state_centroids, coords = c("longitude", "latitude"), crs = 4326)
stations_sf <- st_as_sf(state_centroids, coords = c("longitude", "latitude"), crs = 4326)

closest_stations <- state_centroids_sf %>%
  rowwise() %>%
  mutate(closest_index = {
    distances <- st_distance(geometry, stations_sf)
    which.min(distances)
  }) %>%
  mutate(closest_station = stations_sf[closest_index, ]) %>%
  ungroup() %>%
  select(STATE, closest_station)

closest_stations
Simple feature collection with 37 features and 2 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -118.3323 ymin: 29.48505 xmax: -70.817 ymax: 44.92613
Geodetic CRS:  WGS 84
# A tibble: 37 × 3
   STATE closest_station$STATE             geometry
   <chr> <chr>                          <POINT [°]>
 1 AL    AL                    (-86.46786 32.68585)
 2 AR    AR                         (-92.15 34.917)
 3 AZ    AZ                     (-110.4338 31.6845)
 4 CA    CA                    (-118.3323 35.34662)
 5 CO    CO                    (-105.6824 39.12739)
 6 DE    DE                        (-75.467 39.133)
 7 FL    FL                    (-82.68293 29.48505)
 8 GA    GA                    (-83.07165 32.45818)
 9 IA    IA                     (-93.3842 41.78051)
10 IL    IL                    (-88.68583 40.20475)
# ℹ 27 more rows
# ℹ 1 more variable: closest_station$geometry <POINT [°]>
longitude <- st_coordinates(closest_stations)[, 1]
latitude <- st_coordinates(closest_stations)[, 2]

closest_stations2 <- data.frame(stations_sf$STATE, latitude, longitude)
closest_stations2
   stations_sf.STATE latitude  longitude
1                 AL 32.68585  -86.46786
2                 AR 34.91700  -92.15000
3                 AZ 31.68450 -110.43383
4                 CA 35.34662 -118.33231
5                 CO 39.12739 -105.68236
6                 DE 39.13300  -75.46700
7                 FL 29.48505  -82.68293
8                 GA 32.45818  -83.07165
9                 IA 41.78051  -93.38420
10                IL 40.20475  -88.68583
11                IN 40.17699  -86.60331
12                KS 37.75650  -98.58310
13                KY 37.22173  -84.10867
14                LA 31.13350  -92.14517
15                MD 39.11552  -77.22391
16                MI 43.12047  -84.65518
17                MN 44.92613  -93.88014
18                MO 38.27069  -92.90385
19                MS 33.82332  -90.34586
20                NC 35.52200  -78.65477
21                NE 41.15257  -97.89149
22                NH 43.08300  -70.81700
23                NJ 40.55849  -74.27941
24                NM 34.20217 -106.70009
25                NY 40.96000  -72.25200
26                OH 40.19533  -82.47037
27                OK 35.34819  -97.53448
28                PA 40.44144  -77.66333
29                SC 34.65354  -82.33462
30                SD 43.80530  -99.69620
31                TX 32.08531  -98.48281
32                UT 41.34595 -112.01700
33                VA 37.32974  -77.55341
34                VT 44.85483  -72.43100
35                WI 44.39987  -90.29389
36                WV 37.86700  -80.40000
37                WY 41.03700 -107.49300
library(leaflet)
closest_map <- leaflet() %>%
  addProviderTiles("CartoDB.Positron")
closest_map <- closest_map %>%
  addCircleMarkers(data = closest_stations2,
                   lat = ~latitude,
                   lng = ~longitude,
                   color = "lightblue",
                   radius = 10,
                   stroke = FALSE,
                   fillOpacity = 0.8,
                   label = ~"Closest Station")
closest_map <- closest_map %>%
  addCircleMarkers(data = state_centroids,
                   lat = ~latitude,
                   lng = ~longitude,
                   color = "goldenrod",
                   radius = 5,
                   stroke = FALSE,
                   fillOpacity = 0.8,
                   label = ~"Midpoint")
closest_map <- closest_map %>%
  addLegend(position = "bottomright", 
            colors = c("lightblue", "goldenrod"), 
            labels = c("Closest Station", "Midpoint"),
            title = "Legend")
closest_map

Question 4: Means of means

States’ average temperature.

average_temp_by_state <- met %>%
  group_by(STATE) %>%
  summarize(average_temperature = mean(temp, na.rm = TRUE)) %>%
  mutate(Classification = case_when(
    average_temperature < 20 ~ "Low",
    average_temperature >= 20 & average_temperature  < 25 ~ "Mid",
    average_temperature >= 25 ~ "High"
  ))
average_temp_by_state
# A tibble: 48 × 3
   STATE average_temperature Classification
   <chr>               <dbl> <chr>         
 1 AL                   26.2 High          
 2 AR                   26.2 High          
 3 AZ                   28.8 High          
 4 CA                   22.4 Mid           
 5 CO                   19.5 Low           
 6 CT                   22.3 Mid           
 7 DE                   24.6 Mid           
 8 FL                   27.5 High          
 9 GA                   26.5 High          
10 IA                   21.3 Mid           
# ℹ 38 more rows

Number of entries (records)

nrow(met)
[1] 2377343

Number of NA entries

total_na <- sum(is.na(met))
total_na
[1] 2927561

Number of stations

unique_stations <- length(unique(met$USAFID))
unique_stations
[1] 1595

Number of states included

unique_states <- length(unique(met$STATE))
unique_states
[1] 48

Mean temperature

mean_temp <- mean(met$temp, na.rm = TRUE)
mean_temp
[1] 23.59054

Mean windspeed

mean_wind <- mean(met$wind.sp, na.rm = TRUE)
mean_wind
[1] 2.459166

Mean atmospheric pressure

mean_pressure <- mean(met$atm.press, na.rm = TRUE)
mean_pressure
[1] 1014.164

Summary Table by Average Temperature Level

met <-met %>%
  mutate(temp_level = cut(temp,
                          breaks = quantile(temp, probs = seq(0, 1, 0.25),  na.rm = TRUE),
                          include.lowest = TRUE,
                          labels = c("Low", "Medium", "High", "Very High")))

state_avg_temp <-met |>
  group_by(STATE) |>
  summarize(
    avg_temp = mean(temp, na.rm = TRUE),
    avg_wind = mean(wind.sp, na.rm = TRUE),
    avg_press = mean(atm.press, na.rm = TRUE),
    num_entries = n(),
    num_na = sum(is.na(temp)),
    .groups = 'drop'
  )

summary_table <- met %>%
  filter(temp_level !="Very High") %>%
  group_by(temp_level) %>%
  summarise(
    number_of_states = n_distinct(STATE),
    mean_temp = mean(temp, na.rm = TRUE),
    mean_wind = mean(wind.sp, na.rm = TRUE),
    mean_pressure = mean(atm.press, na.rm = TRUE))

summary_table
# A tibble: 3 × 5
  temp_level number_of_states mean_temp mean_wind mean_pressure
  <fct>                 <int>     <dbl>     <dbl>         <dbl>
1 Low                      48      15.9      1.89         1016.
2 Medium                   48      21.7      2.23         1015.
3 High                     48      25.6      2.48         1014.